{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q xgboost==0.90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r spark_processing_job_s3_output_prefix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Previous Spark Processing Job Name: {}'.format(spark_processing_job_s3_output_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specify the S3 Location of the Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_train = '{}/output/tfidf-train'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_validation = '{}/output/tfidf-validation'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_test = '{}/output/tfidf-test'.format(spark_processing_job_s3_output_prefix)\n",
    "\n",
    "tfidf_train_path = './{}'.format(prefix_train)\n",
    "tfidf_validation_path = './{}'.format(prefix_validation)\n",
    "tfidf_test_path = './{}'.format(prefix_test)\n",
    "\n",
    "tfidf_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)\n",
    "tfidf_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)\n",
    "tfidf_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)\n",
    "\n",
    "print(tfidf_train_s3_uri)\n",
    "print(tfidf_validation_s3_uri)\n",
    "print(tfidf_test_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3_input_train_data = sagemaker.s3_input(s3_data=tfidf_train_s3_uri, content_type='text/csv')\n",
    "s3_input_validation_data = sagemaker.s3_input(s3_data=tfidf_validation_s3_uri, content_type='text/csv')\n",
    "s3_input_test_data = sagemaker.s3_input(s3_data=tfidf_test_s3_uri, content_type='text/csv')\n",
    "\n",
    "print(s3_input_train_data.config)\n",
    "print(s3_input_validation_data.config)\n",
    "print(s3_input_test_data.config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.amazon.amazon_estimator import get_image_uri \n",
    "import json\n",
    "\n",
    "# get the URI for new container\n",
    "builtin_container_uri = get_image_uri(region_name=region,                                \n",
    "                                      repo_name='xgboost', \n",
    "                                      repo_version='0.90-2')\n",
    "print(builtin_container_uri)\n",
    "\n",
    "model_output_path = 's3://{}/models/built-in/training-runs'.format(bucket)\n",
    "print(model_output_path)\n",
    "\n",
    "xgb_estimator = sagemaker.estimator.Estimator(image_name=builtin_container_uri, \n",
    "                                              role=role, \n",
    "                                              train_instance_count=3,\n",
    "                                              train_instance_type='ml.c5.4xlarge',\n",
    "                                              output_path=model_output_path, \n",
    "                                              sagemaker_session=sess,\n",
    "                                              # enable_cloudwatch_metrics=True\n",
    "                                             )\n",
    "\n",
    "xgb_estimator.set_hyperparameters(objective='binary:logistic',\n",
    "                                  num_round=1,\n",
    "                                  max_depth=5)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_estimator.fit({'train': s3_input_train_data,\n",
    "                   'validation': s3_input_validation_data\n",
    "                  }\n",
    "                  #, wait=False\n",
    "                 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_job_name = xgb_estimator.latest_training_job.name\n",
    "print('training_job_name:  {}'.format(training_job_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO:  This isn't working at the moment\n",
    "#from sagemaker.xgboost import XGBoost\n",
    "\n",
    "#xgb_estimator = XGBoost.attach(training_job_name=training_job_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download the model artifact from AWS S3\n",
    "!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/built-in/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tarfile\n",
    "import pickle as pkl\n",
    "\n",
    "tar = tarfile.open('./models/built-in/model.tar.gz')\n",
    "tar.extractall(path='./models/built-in/')\n",
    "tar.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!ls -al ./models/built-in/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "import os\n",
    "\n",
    "model_dir = './models/built-in'\n",
    "model_path = os.path.join(model_dir, 'xgboost-model')\n",
    "\n",
    "xgb_estimator_restored = pkl.load(open(model_path, 'rb'))\n",
    "\n",
    "type(xgb_estimator_restored)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "import xgboost\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12,12))\n",
    "xgboost.plot_importance(xgb_estimator_restored, \n",
    "                        importance_type='gain', \n",
    "                        max_num_features=30, \n",
    "                        height=0.8, \n",
    "                        ax=ax, \n",
    "                        show_values = True)\n",
    "plt.title('Feature Importance')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Calculate Test Metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Test Dataset and Load into Memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls --recursive $balanced_tfidf_without_header_test_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!aws s3 cp --recursive $balanced_tfidf_without_header_test_s3_uri $balanced_tfidf_without_header_test_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "def load_dataset(path, sep, header):\n",
    "    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\n",
    "\n",
    "    labels = data.iloc[:,0]\n",
    "    features = data.drop(data.columns[0], axis=1)\n",
    "    \n",
    "    if header==None:\n",
    "        # Adjust the column names after dropped the 0th column above\n",
    "        # New column names are 0 (inclusive) to len(features.columns) (exclusive)\n",
    "        new_column_names = list(range(0, len(features.columns)))\n",
    "        features.columns = new_column_names\n",
    "\n",
    "    return features, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test, y_test = load_dataset(path=balanced_tfidf_without_header_test_path, sep=',', header=None)\n",
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "X_test.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xgboost import DMatrix\n",
    "\n",
    "X_test.columns = ['f{}'.format(i) for i in range(0, 300)]\n",
    "\n",
    "# Must convert pandas dataframe to XGBoost DMatrix before predicting\n",
    "preds_test = xgb_estimator_restored.predict(DMatrix(X_test))\n",
    "preds_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert probability values into classification (0 or 1) using threshold 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "preds_test_0_or_1 = np.where(preds_test > 0.5, 1, 0)\n",
    "preds_test_0_or_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "print('Test Accuracy: ', accuracy_score(y_test, preds_test_0_or_1))\n",
    "print('Test Precision: ', precision_score(y_test, preds_test_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, preds_test_0_or_1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cm_test = confusion_matrix(y_test, preds_test_0_or_1)\n",
    "df_cm_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):\n",
    "    print(cm)\n",
    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    tick_marks = np.arange(len(classes))\n",
    "    plt.xticks(tick_marks, classes, rotation=45)\n",
    "    plt.yticks(tick_marks, classes)\n",
    "\n",
    "    fmt = 'd'\n",
    "    thresh = cm.max() / 2.\n",
    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
    "        plt.text(j, i, format(cm[i, j], fmt),\n",
    "        horizontalalignment=\"center\",\n",
    "        color=\"black\" if cm[i, j] > thresh else \"black\")\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.ylabel('True label')\n",
    "        plt.xlabel('Predicted label')\n",
    "\n",
    "# Plot non-normalized confusion matrix\n",
    "plt.figure()\n",
    "fig, ax = plt.subplots(figsize=(10,5))\n",
    "plot_conf_mat(df_cm_test, classes=['Not Positive Sentiment', 'Positive Sentiment'], \n",
    "                          title='Confusion matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_test, preds_test_0_or_1), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_test, preds_test_0_or_1)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
